* --------------------------------------------------------------------------------------
* This script cleans data from the SSA
*
* SSA Disability Analysis File Public Use File
* https://www.ssa.gov/disabilityresearch/daf_puf.html
* --------------------------------------------------------------------------------------

* Settings
*   Pin the Stata version so that commands are interpreted as they were under version 16.
version 16
*   Run the shared project settings script. The SSDIMed global macro must already be
*   defined when this line executes, because it is used to locate the settings script.
do "$SSDIMed/scripts/_auxiliary/_project_settings.do"

* ABOUT THE SSA DAF PUF

* The DAF is a set of files containing SSA administrative data on federal disability beneficiaries, culled from a variety of SSA sources. 
* In particular, it contains data related to program participation and benefits for Social Security Disability Insurance (SSDI) and Supplemental 
* Security Income (SSI) beneficiaries between the ages of 18 and Full Retirement Age (FRA) who have received disability benefits in any month 
* since 1996. It also includes data on selected beneficiaries aged 10 to 17 who have received benefits since 2005. There are millions of disability 
* beneficiaries in each year of the DAF.
* 
* This PUF contains a random 10 percent (10%) sample of beneficiaries included in the DAF.


* --------------------------------------------------------------------------------------
* Clean SSA DAF PUF annual (ANN) files: 
*   Output 1: Cleaned, converted from annual (wide) to monthly (long) format
*   Output 2: Summary file with one observation per pufpin
* --------------------------------------------------------------------------------------
* Create the output directories once, before the loop: they are identical for every year.
*   cap suppresses the error that mkdir raises when the directory already exists.
cap mkdir "$SSDIMed/data/proc/ssa"
cap mkdir "$SSDIMed/data/proc/ssa/DAF18"

* For each annual file, 1994-2018:
*   1) load pufpin plus that year's MEDR, EDX and REGION variables
*   2) strip the two-digit year from the variable names
*   3) reshape the MEDR columns to long; the two-character suffix becomes month
*   4) run QC checks and save one long-format file per year
forvalues year = 1994/2018 {
  *local year  1994
  noisily di "Processing SSA DAF18 PUF annual file for `year'"

  * Two-digit year suffix used in raw file and variable names (1994 -> 94, 2005 -> 05).
  *   substr takes (string, start, length), so the length argument is 2.
  local y = substr("`year'", 3, 2)

  * Load SSA DAFPUF annual file
  local keepvars pufpin MEDR_PUF`y'?? EDX_PUF`y' REGION_PUF`y'
  use `keepvars' using "$SSDIMed/data/raw/ssa/DAF18/daf18_ann`y'puf_stata.dta", clear

  * year of SSA DAFPUF annual file
  gen year = `year'

  * rename variables: drop the year suffix so that all annual files share the same names
  rename EDX_PUF`y' edx_puf
  rename REGION_PUF`y' region_puf
  rename MEDR_PUF`y'?? medr_puf??

  * reshape long: individual-monthly observations
  *   the string option keeps the suffix as text; destring then converts it to a number
  greshape long medr_puf, by(pufpin) keys(month) string
  destring month, replace

  * QC
  *   destring leaves the variable as a string (without raising an error) if any suffix is
  *   nonnumeric, and a month outside 1-12 would later turn into a missing ym() value,
  *   so fail loudly here instead
  confirm numeric variable month
  assert inrange(month, 1, 12)
  *   one row per person-month; the indicator is 0/1 whenever it is nonmissing
  gisid pufpin month
  assert inlist(medr_puf, 0, 1) if !missing(medr_puf)

  * Sort and save
  sort pufpin year month 
  compress _all 
  save "$SSDIMed/data/proc/ssa/DAF18/daf18_puf_ann`year'_long.dta", replace
}

* Stack the yearly long files into a single person-month panel
clear
forvalues yr = 1994/2018 {
  append using "$SSDIMed/data/proc/ssa/DAF18/daf18_puf_ann`yr'_long.dta"
}

* Collapse to one record per pufpin:
*   - covstart_month: earliest month in which medr_puf equals 1
*   - education and region: first nonmissing value, in year-month order
gen covstart_month = cond(medr_puf == 1, ym(year, month), .)
sort pufpin year month
gcollapse (min) covstart_month (firstnm) edx_firstnm = edx_puf region_firstnm = region_puf, by(pufpin) labelformat(#sourcelabel#)
gen covstart_year = yofd(dofm(covstart_month))
tab covstart_year

* Display format, variable labels and value labels
format covstart_month %tm
label var covstart_month "Initial month of SSDI Medicare eligibility (DAFPUF)"
label var covstart_year "Initial year of SSDI Medicare eligibility (DAFPUF)"
label var region_firstnm "Region, first nonmissing (region_puf)"
label var edx_firstnm "Education level, first nonmissing (edx_puf)"
label define edx 1 "less than high school" 2 "high school" 3 "some college" 4 "college or more"
label values edx_firstnm edx

* Confirm one row per pufpin, then order and save
by pufpin, sort: assert _N == 1
compress
order pufpin covstart_month
save "$SSDIMed/data/proc/ssa/DAF18/daf18_puf_ann_summary.dta", replace


* --------------------------------------------------------------------------------------
* Clean SSA DAF PUF demographic (DMG) files:
*   Aim: bring DAF PUF sample as close as possible to Medicare sample
* --------------------------------------------------------------------------------------

* Load the demographic file and lowercase every variable name
use "$SSDIMed/data/raw/ssa/DAF18/daf18_dmgpuf_stata.dta", clear
rename *, lower

* DAF PUF comparison sample: 
*   - SSDI entitled
*   - Lived to Medicare eligibility
*   - First month of Medicare eligibility observed,
*   - First month of Medicare eligibility prior to age 65
*   - Nonmissing on key variables

* Keep if primary claimant / number holder (worker)
* keep if bic_puf == "A"

* Q: which date of entitlement do we go with?
* From documentation: 
*   doei_puf: [almost always] gives the earliest month of entitlement to SSDI benefits and is considered to be the enrollment date.
*   bdoe_start_pufn: indicates when a beneficiary's eligibility for SSDI began, where n denotes the occurrence (n = 1 is used below)
describe *doe*

* doei_puf should not be later than any of the *doe* dates; tolerate at most one exception per variable
foreach datevar of varlist *doe* {
  di "doei_puf <= `datevar'"
  count if !missing(doei_puf) & doei_puf > `datevar'
  assert r(N) <= 1
}

* High agreement between bdoe_start_puf1 and doei_puf
*   Share over all observations (rows where both are missing count as equal)
count if bdoe_start_puf1 == doei_puf
assert inrange(r(N)/_N, 0.97, 0.98)
*   Share among observations where at least one of the two dates is nonmissing
count if !(missing(bdoe_start_puf1) & missing(doei_puf))
local n_anydate = r(N)
count if bdoe_start_puf1 == doei_puf & !(missing(bdoe_start_puf1) & missing(doei_puf))
local n_agree = r(N)
assert inrange(`n_agree'/`n_anydate', 0.96, 0.97)

* Use the 1st occurrence of SSDI eligibility (bdoe_start_puf1), since it has an associated filing date variable
drop if missing(bdoe_start_puf1)
assert !missing(bdof_puf1)

* Restrict to the identifier and the date / claimant variables
local dmgvars pufpin dobbest_puf dodbest_puf bic_puf bdoe_start_puf1 bdof_puf1 doei_puf ssdi_onset_puf
keep `dmgvars'

* Attach the per-person Medicare start dates from the annual-file summary
*   merge codes: 1 = master only, 2 = using only, 3 = matched; using-only rows are dropped
merge 1:1 pufpin using "$SSDIMed/data/proc/ssa/DAF18/daf18_puf_ann_summary.dta", assert(1 2 3) keep(1 3) nogenerate noreport

* QC: no observed Medicare coverage start falls after the month of death
*   (a missing death date compares as larger than any month, so it passes)
assert mofd(dodbest_puf) >= covstart_month if !missing(covstart_month)

* Predict covstart_month (Medicare covg start) based on DIB entitlement and age
*   Month in which the individual turns 65
gen date_65yr = mofd(dobbest_puf) + 65 * 12
*   Month in which the 24-month waiting period after DIB entitlement ends
gen date_24mo = mofd(bdoe_start_puf1) + 24
*   Month in which the individual turns 20
gen date_20yr = mofd(dobbest_puf) + 20 * 12

*   Predicted start absent mortality: end of the waiting period, capped at age 65 and floored at age 20
gen covstart_month_pred = max(min(date_65yr, date_24mo), date_20yr)

*   No prediction if the individual died before the predicted start month
replace covstart_month_pred = . if covstart_month_pred > mofd(dodbest_puf)

*   No prediction beyond the latest observed Medicare start month
summarize covstart_month
local last_observed = r(max)
replace covstart_month_pred = . if covstart_month_pred > `last_observed'

* QC: among observed starts in 1994m2 - 1996m12, the prediction matches in 93-94% of cases
*   (original author's note: over 90% accuracy for 1994m2+, over 93% for 1994m2 - 1996m12)
local m_first = ym(1994, 2)
local m_last = ym(1996, 12)
count if inrange(covstart_month, `m_first', `m_last')
local n_window = r(N)
count if (covstart_month_pred == covstart_month) & inrange(covstart_month, `m_first', `m_last')
assert inrange(r(N)/`n_window', 0.93, 0.94)

* The annual files (the source of the observed start month) begin in 1994, so starts in
* 1993m1 - 1994m1 cannot be measured directly.
*   Flag observed values at or before 1994m1 and overwrite them with the predicted month
gen covstart_month_flag = covstart_month <= ym(1994, 1)
replace covstart_month = covstart_month_pred if covstart_month_flag
drop covstart_month_pred
label var covstart_month_flag "Indicates that covstart_month is predicted based on age and eligibility rules, not observed"

* Age at Medicare coverage start: months since birth divided by 12, and its integer floor
gen age_mofd_covstart_ssdi = (covstart_month - mofd(dobbest_puf)) / 12
label var age_mofd_covstart_ssdi "(Age in months)/12 at Medicare coverage start (SSA DAF PUF)"
gen age_year_covstart_ssdi = floor(age_mofd_covstart_ssdi)
label var age_year_covstart_ssdi "Age in years at Medicare coverage start (SSA DAF PUF)"

* QC: nearly all individuals with a coverage start month gained Medicare at ages 20-65
count if !missing(covstart_month)
local n_withstart = r(N)
count if inrange(age_year_covstart_ssdi, 20, 65)
assert inrange(r(N)/`n_withstart', 0.9990, 0.9999)

* Sample restrictions, parallel to the approach used in the Medicare cleaning script:
*   1) aged 20-64 at coverage start, based on covstart_month
*   2) coverage start within the sample period 1993-2017
drop if !inrange(age_year_covstart_ssdi, 20, 64)
drop if !inrange(yofd(dofm(covstart_month)), 1993, 2017)
tabulate covstart_month

* Confirm one row per pufpin, then save
by pufpin, sort: assert _N == 1
compress
save "$SSDIMed/data/proc/ssa/DAF18/daf18_puf_ann_dmg_sample.dta", replace




** EOF
